import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
pd.options.display.float_format = '{:.3f}'.format
# VARS
dir_diversity_output = '../results_diversity'
dir_reads_fastq = '../data/control_sample/'
fastq_basename = 'POOL.fastq.gz'
dir_results_profiling = '../results_profiling'
pools_file="../data/EM_EVPools/samples_profiling.txt"
today = datetime.today().strftime('%Y-%m-%d')
os.makedirs(f'{dir_diversity_output}/{today}', exist_ok=True)
# GENERAL VARIABLES
POOL_list = !cat {pools_file}
POOL_list += ['ACIDOLA', 'BLACTIS']
# Attributes
cutoff_NA_ratio = 0.35
# Create the pooled dataframe. We are going to separate mean and percentage to have some representation of two variables.
# The dataframe we are going to show has values of all samples using the raw dataframe, and not the cutoff one. However, to do the filtering we
# are going to use the cutoff one, because for some species that have discordant read values, in many pools they are discarded, but not in all of them; so
# when doing the heatmap here, they appear on top, when in reality they should have been discarded for not appearing in many datasets
list_dfs_means_raw, list_dfs_per_raw = [], []
list_dfs_means_cutoff, list_dfs_per_raw_cutoff = [], []
list_selected_index = []
for POOL in POOL_list:
df_POOL_cutoff = pd.read_csv(f'{dir_diversity_output}/{today}/{POOL}.diversity_cutoff.tsv', sep='\t', index_col='Unnamed: 0')
df_POOL_cutoff.reset_index(inplace=True)
df_POOL_cutoff = df_POOL_cutoff[['index', 'name', 'mean (%)', 'mean']].rename(columns = {'mean (%)': f'mean (%) {POOL}', 'mean': f'mean {POOL}'})
df_POOL_cutoff['taxon - genus'] = df_POOL_cutoff['index'].astype(str) + ' - ' + df_POOL_cutoff['name']
df_POOL_cutoff = df_POOL_cutoff.set_index('taxon - genus')
df_POOL_raw = pd.read_csv(f'{dir_diversity_output}/{today}/{POOL}.diversity_raw.tsv', sep='\t', index_col='Unnamed: 0')
df_POOL_raw.reset_index(inplace=True)
df_POOL_raw = df_POOL_raw[['index', 'name', 'mean (%)', 'mean']].rename(columns = {'mean (%)': f'mean (%) {POOL}', 'mean': f'mean {POOL}'})
df_POOL_raw['taxon - genus'] = df_POOL_raw['index'].astype(str) + ' - ' + df_POOL_raw['name']
df_POOL_raw = df_POOL_raw.set_index('taxon - genus')
list_dfs_means_raw.append(df_POOL_raw[f'mean {POOL}'])
list_dfs_per_raw.append(df_POOL_raw[f'mean (%) {POOL}'])
list_dfs_means_cutoff.append(df_POOL_cutoff[f'mean {POOL}'])
list_dfs_per_raw_cutoff.append(df_POOL_cutoff[f'mean (%) {POOL}'])
list_selected_index += df_POOL_cutoff.index.tolist()
selected_index = list(set(list_selected_index))
df_mean_raw, df_per_raw = pd.concat(list_dfs_means_raw, axis=1), pd.concat(list_dfs_per_raw, axis=1)
df_mean_raw = df_mean_raw.loc[selected_index]
df_per_raw = df_per_raw.loc[selected_index]
df_mean_cutoff, df_per_cutoff = pd.concat(list_dfs_means_cutoff, axis=1), pd.concat(list_dfs_per_raw_cutoff, axis=1)
df_mean_cutoff = df_mean_cutoff.loc[selected_index]
df_per_cutoff = df_per_cutoff.loc[selected_index]
# NA cut to keep only species that have only a set of values as NAs
nonNA_index = df_mean_cutoff[df_mean_cutoff.isna().sum(1) < int(cutoff_NA_ratio * len(POOL_list))].index
# Then we order by the median of the values (using mean skewed some species much present in a few samples)
df_mean_cutoff_nonNA = df_mean_cutoff.loc[nonNA_index]
df_mean_cutoff_nonNA = df_mean_cutoff_nonNA.assign(m=df_mean_cutoff_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_mean_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_cutoff_nonNA.tsv', sep='\t')
df_per_cutoff_nonNA = df_per_cutoff.loc[nonNA_index]
df_per_cutoff_nonNA = df_per_cutoff_nonNA.assign(m=df_per_cutoff_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_per_cutoff_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_cutoff_nonNA.tsv', sep='\t')
# Then do the same in raw, but only with cutoff samples
df_mean_raw_cutoffindex_nonNA = df_mean_raw.loc[df_mean_cutoff_nonNA.index.values]
df_mean_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_cutoffindex_nonNA.tsv', sep='\t')
df_per_raw_cutoffindex_nonNA = df_per_raw.loc[df_per_cutoff_nonNA.index.values]
df_per_raw_cutoffindex_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_cutoffindex_nonNA.tsv', sep='\t')
# Then do the same in raw as with cutoff
df_mean_raw_nonNA = df_mean_raw
df_mean_raw_nonNA = df_mean_raw_nonNA.assign(m=df_mean_raw_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_mean_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_mean_raw_nonNA.tsv', sep='\t')
df_per_raw_nonNA = df_per_raw
df_per_raw_nonNA = df_per_raw_nonNA.assign(m=df_per_raw_nonNA.median(axis=1)).sort_values('m', ascending=False).drop('m', axis=1)
df_per_raw_nonNA.to_csv(f'{dir_diversity_output}/{today}/df_per_raw_nonNA.tsv', sep='\t')
df_mean_raw_nonNA
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | mean ACIDOLA | mean BLACTIS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | ||||||||||||||
| 9605 - Homo | 1709929.998 | 1933932.908 | 1288302.626 | 1523916.538 | 1034480.997 | 570833.058 | 1050951.429 | 1048025.882 | 1049918.995 | 1260704.944 | 1109145.249 | 1321520.166 | 2305.690 | 288.340 |
| 36034 - Saccharomycodes | 81123.066 | 59855.630 | 47695.527 | 62199.371 | 45876.058 | 19458.106 | 37345.758 | 30799.966 | 52090.312 | 61101.690 | 34952.457 | 65185.823 | 6.961 | 3.252 |
| 1912216 - Cutibacterium | 11290.359 | 51032.855 | 18299.991 | 21386.945 | 16490.551 | 22427.767 | 23451.886 | 21068.794 | 32960.381 | 9649.674 | 17538.432 | 38091.691 | 480.752 | 73.982 |
| 37914 - Dietzia | 10825.673 | 52825.661 | 17149.516 | 8769.884 | 20165.245 | 28191.640 | 45763.874 | 15913.459 | 46163.752 | 8122.847 | 6675.907 | 40384.093 | NaN | NaN |
| 13687 - Sphingomonas | 4688.349 | 29033.744 | 12101.603 | 6543.925 | 17594.253 | 25613.385 | 28941.491 | 20731.628 | 31299.074 | 6169.392 | 6310.884 | 29744.999 | 13.303 | 1.084 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2948645 - Burzaovirus | NaN | NaN | 1.976 | NaN | NaN | 283.320 | 0.866 | NaN | NaN | NaN | NaN | NaN | 1.856 | NaN |
| 1184396 - Mesotoga | 1.705 | 3.427 | 2.964 | NaN | 1.079 | 1080.139 | 1.731 | 9.744 | 1.125 | 1.313 | NaN | NaN | NaN | 1.084 |
| 499228 - Tepidanaerobacter | NaN | 1.142 | 20.749 | NaN | 1.079 | 105.992 | 1.443 | 313.814 | 1.687 | 6.420 | 1.157 | 1.194 | NaN | NaN |
| 2082587 - Parolsenella | 2.558 | 1.142 | 56.812 | 1.023 | 1.079 | 1289.236 | 0.866 | 540.271 | 1.125 | 0.875 | 8.101 | 5.971 | 1.856 | 1.084 |
| 51196 - Syntrophobotulus | NaN | 1.142 | NaN | 1.023 | 74.964 | 428.590 | 0.866 | 112.220 | 1.125 | NaN | 5.208 | 1.194 | NaN | NaN |
817 rows × 14 columns
df_mean_raw_cutoffindex_nonNA
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | mean ACIDOLA | mean BLACTIS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | ||||||||||||||
| 9605 - Homo | 1709929.998 | 1933932.908 | 1288302.626 | 1523916.538 | 1034480.997 | 570833.058 | 1050951.429 | 1048025.882 | 1049918.995 | 1260704.944 | 1109145.249 | 1321520.166 | 2305.690 | 288.340 |
| 1912216 - Cutibacterium | 11290.359 | 51032.855 | 18299.991 | 21386.945 | 16490.551 | 22427.767 | 23451.886 | 21068.794 | 32960.381 | 9649.674 | 17538.432 | 38091.691 | 480.752 | 73.982 |
| 13687 - Sphingomonas | 4688.349 | 29033.744 | 12101.603 | 6543.925 | 17594.253 | 25613.385 | 28941.491 | 20731.628 | 31299.074 | 6169.392 | 6310.884 | 29744.999 | 13.303 | 1.084 |
| 37914 - Dietzia | 10825.673 | 52825.661 | 17149.516 | 8769.884 | 20165.245 | 28191.640 | 45763.874 | 15913.459 | 46163.752 | 8122.847 | 6675.907 | 40384.093 | NaN | NaN |
| 469 - Acinetobacter | 2772.345 | 11158.895 | 7052.537 | 9496.731 | 13154.365 | 23846.245 | 14577.205 | 80605.920 | 21943.776 | 15249.816 | 3400.060 | 15808.468 | 36.505 | 45.527 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 724 - Haemophilus | 97.470 | 969.743 | 204.851 | 294.488 | 659.759 | 4195.865 | 1048.730 | 519.103 | 439.022 | 248.185 | 263.472 | 6373.544 | 0.928 | NaN |
| 482 - Neisseria | 440.461 | 2410.079 | 156.767 | 343.314 | 584.614 | 1009.670 | 356.020 | 1045.263 | 827.701 | 194.054 | 362.226 | 15916.843 | 3.248 | NaN |
| 497 - Psychrobacter | 193.519 | 255.476 | 673.505 | 414.124 | 3318.928 | 455.738 | 484.118 | 1028.127 | 1373.220 | 291.227 | 289.318 | 1429.082 | 0.928 | NaN |
| 84567 - Pedobacter | 138.674 | 359.418 | 362.606 | 390.606 | 602.592 | 566.640 | 384.005 | 526.159 | 774.733 | 201.349 | 329.436 | 723.696 | 2.784 | NaN |
| 2040 - Aeromicrobium | 57.970 | 664.009 | 265.779 | 348.000 | 356.306 | 590.467 | 604.715 | 283.575 | 578.238 | 221.193 | 263.472 | 807.292 | NaN | 1.084 |
102 rows × 14 columns
df_mean_cutoff_nonNA
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | mean ACIDOLA | mean BLACTIS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | ||||||||||||||
| 9605 - Homo | 1709929.998 | 1933932.908 | 1288302.626 | 1523916.538 | 1034480.997 | 570833.058 | 1050951.429 | 1048025.882 | 1049918.995 | 1260704.944 | 1109145.249 | 1321520.166 | 2305.690 | 288.340 |
| 1912216 - Cutibacterium | 11290.359 | 51032.855 | 18299.991 | 21386.945 | 16490.551 | 22427.767 | 23451.886 | 21068.794 | 32960.381 | 9649.674 | 17538.432 | 38091.691 | 480.752 | NaN |
| 13687 - Sphingomonas | 4688.349 | 29033.744 | 12101.603 | 6543.925 | 17594.253 | 25613.385 | 28941.491 | 20731.628 | 31299.074 | 6169.392 | 6310.884 | 29744.999 | NaN | NaN |
| 37914 - Dietzia | 10825.673 | 52825.661 | 17149.516 | 8769.884 | 20165.245 | 28191.640 | 45763.874 | 15913.459 | 46163.752 | 8122.847 | 6675.907 | 40384.093 | NaN | NaN |
| 469 - Acinetobacter | 2772.345 | 11158.895 | 7052.537 | 9496.731 | 13154.365 | 23846.245 | 14577.205 | 80605.920 | 21943.776 | 15249.816 | 3400.060 | 15808.468 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 724 - Haemophilus | NaN | 969.743 | 204.851 | 294.488 | 659.759 | 4195.865 | 1048.730 | 519.103 | 439.022 | 248.185 | 263.472 | 6373.544 | NaN | NaN |
| 482 - Neisseria | 440.461 | 2410.079 | 156.767 | 343.314 | 584.614 | 1009.670 | 356.020 | 1045.263 | 827.701 | 194.054 | 362.226 | 15916.843 | NaN | NaN |
| 497 - Psychrobacter | 193.519 | 255.476 | 673.505 | 414.124 | 3318.928 | 455.738 | 484.118 | 1028.127 | 1373.220 | 291.227 | 289.318 | 1429.082 | NaN | NaN |
| 84567 - Pedobacter | 138.674 | 359.418 | 362.606 | 390.606 | 602.592 | 566.640 | 384.005 | 526.159 | 774.733 | 201.349 | 329.436 | 723.696 | NaN | NaN |
| 2040 - Aeromicrobium | NaN | 664.009 | 265.779 | 348.000 | 356.306 | 590.467 | 604.715 | 283.575 | 578.238 | 221.193 | 263.472 | 807.292 | NaN | NaN |
102 rows × 14 columns
N = 25
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_raw_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
fig, ax = plt.subplots(1, 1, figsize=(9, 7))
sns.heatmap(np.log10(df_mean_cutoff_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | mean ACIDOLA | mean BLACTIS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | ||||||||||||||
| 1912216 - Cutibacterium | 11290.359 | 51032.855 | 18299.991 | 21386.945 | 16490.551 | 22427.767 | 23451.886 | 21068.794 | 32960.381 | 9649.674 | 17538.432 | 38091.691 | 480.752 | 73.982 |
| 13687 - Sphingomonas | 4688.349 | 29033.744 | 12101.603 | 6543.925 | 17594.253 | 25613.385 | 28941.491 | 20731.628 | 31299.074 | 6169.392 | 6310.884 | 29744.999 | 13.303 | 1.084 |
| 37914 - Dietzia | 10825.673 | 52825.661 | 17149.516 | 8769.884 | 20165.245 | 28191.640 | 45763.874 | 15913.459 | 46163.752 | 8122.847 | 6675.907 | 40384.093 | NaN | NaN |
| 469 - Acinetobacter | 2772.345 | 11158.895 | 7052.537 | 9496.731 | 13154.365 | 23846.245 | 14577.205 | 80605.920 | 21943.776 | 15249.816 | 3400.060 | 15808.468 | 36.505 | 45.527 |
| 12916 - Acidovorax | 3195.614 | 11895.625 | 13657.993 | 4548.717 | 13717.947 | 19686.625 | 30035.517 | 10773.411 | 33610.055 | 5720.076 | 4023.829 | 25862.296 | 3.094 | NaN |
| 1386 - Bacillus | 2112.507 | 15043.006 | 7450.383 | 4710.447 | 7874.768 | 48486.421 | 12120.551 | 15452.481 | 17655.743 | 4005.104 | 4657.627 | 20831.152 | 1099.170 | 76.963 |
| 106589 - Cupriavidus | 2594.172 | 14180.917 | 12334.036 | 3906.569 | 11694.717 | 20391.026 | 26735.047 | 9139.243 | 30993.361 | 5864.085 | 3695.164 | 23304.578 | 9.745 | NaN |
| 1716 - Corynebacterium | 3982.049 | 12378.212 | 4941.618 | 5210.548 | 12241.310 | 13249.976 | 19116.029 | 7387.899 | 19589.483 | 4648.328 | 21728.329 | 9789.607 | 41.764 | 5.781 |
| 1678 - Bifidobacterium | 715.891 | 13496.158 | 10190.759 | 347.404 | 1395.201 | 405656.412 | 12225.569 | 9068.686 | 9712.665 | 333.832 | 189.792 | 1440.626 | 82.832 | 1746265.778 |
| 286 - Pseudomonas | 1579.691 | 7116.301 | 8640.791 | 2966.866 | 20818.262 | 31180.146 | 5227.350 | 215000.525 | 113899.851 | 48460.663 | 5460.869 | 8496.268 | 46.405 | 2.891 |
| 196013 - Caldimonas | 2271.286 | 8988.396 | 7712.293 | 2637.356 | 8077.819 | 10798.434 | 15713.642 | 5392.375 | 17120.724 | 3553.162 | 2544.259 | 14416.905 | 1.856 | NaN |
| 1827 - Rhodococcus | 2528.316 | 10710.003 | 5306.694 | 2767.728 | 8542.706 | 9914.901 | 15098.469 | 5647.391 | 15276.607 | 2576.179 | 3059.533 | 18296.921 | 20.418 | NaN |
| 57493 - Kocuria | 1031.744 | 15190.352 | 2149.453 | 9120.696 | 54524.200 | 31542.960 | 5599.527 | 6025.127 | 8896.870 | 2415.975 | 2836.470 | 7362.656 | 19.799 | 1.445 |
| 909656 - Phocaeicola | 197.497 | 6149.319 | 21086.145 | 540.917 | 910.719 | 919244.317 | 15336.056 | 179271.849 | 11252.007 | 165.238 | 344.481 | 1222.084 | 1.856 | NaN |
| 407 - Methylobacterium | 1188.605 | 5606.004 | 5145.645 | 2551.464 | 6594.710 | 14681.088 | 7832.803 | 5201.617 | 11522.001 | 2332.809 | 2805.802 | 14268.225 | 2.475 | 1.084 |
| 1301 - Streptococcus | 1968.220 | 69050.196 | 2013.929 | 2781.277 | 4284.027 | 95039.999 | 5335.541 | 10572.910 | 9364.673 | 3412.874 | 4222.012 | 54646.247 | 68.679 | 1.445 |
| 265 - Paracoccus | 903.442 | 4292.073 | 774.860 | 7220.583 | 10034.174 | 7797.582 | 1207.410 | 60205.409 | 2228.858 | 4524.892 | 10290.737 | 997.471 | 3.403 | 2.710 |
| 1883 - Streptomyces | 1273.358 | 8505.714 | 3106.110 | 3501.478 | 4198.816 | 7967.039 | 5357.828 | 3827.253 | 7735.333 | 2286.922 | 2236.425 | 7807.801 | 70.535 | 7.227 |
| 1279 - Staphylococcus | 2020.010 | 5626.945 | 1953.988 | 3297.654 | 2980.779 | 13478.350 | 4838.007 | 4716.617 | 7185.501 | 1506.838 | 2477.426 | 5648.654 | 146.174 | 9.033 |
| 816 - Bacteroides | 227.335 | 3974.156 | 9193.922 | 306.758 | 823.350 | 436335.983 | 5810.066 | 208378.933 | 4382.530 | 221.047 | 379.874 | 2475.217 | 1.237 | NaN |
| 572511 - Blautia | 468.309 | 6134.470 | 29231.111 | 737.754 | 1630.521 | 964184.112 | 20585.045 | 43023.591 | 20057.566 | 158.745 | 289.028 | 617.411 | 2.784 | 1.084 |
| 283 - Comamonas | 1886.167 | 4940.377 | 4192.446 | 782.234 | 2140.800 | 3336.158 | 6658.211 | 4255.640 | 5241.824 | 2053.327 | 1709.288 | 3851.952 | 2.166 | NaN |
| 1269 - Micrococcus | 1110.814 | 8954.701 | 2225.037 | 6058.480 | 9975.120 | 4722.578 | 2774.230 | 2128.828 | 2855.938 | 10248.032 | 2784.971 | 5304.719 | 10.828 | 1.084 |
| 57494 - Nesterenkonia | 1232.153 | 4312.252 | 3382.017 | 1572.989 | 4056.707 | 5007.053 | 9124.963 | 3493.616 | 8785.685 | 1613.714 | 1117.537 | 9788.213 | NaN | NaN |
N = 100
fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_mean_nonNA_annot.png', dpi=300)
fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_mean_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=False, cmap='Blues')
plt.title('log10 mean counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_mean_nonNA.png', dpi=300)
N = 100
fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_per_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_per_nonNA_annot.png', dpi=300)
fig, ax = plt.subplots(1, 1, figsize=(9, 22))
sns.heatmap(np.log10(df_per_raw_cutoffindex_nonNA.iloc[1:N, :]), yticklabels=True, annot=False, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
plt.savefig(f'{dir_diversity_output}/{today}/heatmap_per_nonNA.png', dpi=300)
## Try a quick wilcoxon test
from scipy.stats import mannwhitneyu, ttest_ind
For this part we can use the follwoign databases to get some insights https://mbodymap.microbiome.cloud/ | https://www.microbiomeatlas.org/
# RR [POOL 1-4] vs HC [POOL 8-12]
list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []
for row in range(len(df_mean_raw_cutoffindex_nonNA)):
condition_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, :4].values
reference_vals = df_mean_raw_cutoffindex_nonNA.iloc[row, 8:12].values
res = mannwhitneyu(condition_vals, reference_vals)
list_pvals_mannwhitney.append(res.pvalue)
res = ttest_ind(condition_vals, reference_vals)
list_pvals_ttest.append(res.pvalue)
L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,8,9,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney
df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])
df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]
fig, ax = plt.subplots(1, 1, figsize=(9, 2))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/465228851.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['log2FC'] = L2FC /tmp/ipykernel_2735545/465228851.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_ttest'] = list_pvals_ttest /tmp/ipykernel_2735545/465228851.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_MW'] = list_pvals_mannwhitney
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | log2FC | pval_ttest | pval_MW | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | |||||||||||
| 53457 - Janibacter | 242.680 | 101.657 | 476.888 | 486.383 | 802.670 | 538.391 | 526.558 | 604.673 | -0.919 | 0.043 | 0.029 |
| 9605 - Homo | 1709929.998 | 1933932.908 | 1288302.626 | 1523916.538 | 1049918.995 | 1260704.944 | 1109145.249 | 1321520.166 | 0.445 | 0.030 | 0.057 |
| 125216 - Roseomonas | 174.337 | 458.600 | 707.427 | 485.701 | 2409.323 | 572.095 | 1047.715 | 1634.288 | -1.633 | 0.058 | 0.057 |
| 1696 - Brevibacterium | 334.182 | 519.995 | 905.362 | 1117.112 | 1698.713 | 1061.900 | 2918.636 | 1184.068 | -1.255 | 0.073 | 0.057 |
| 237 - Flavobacterium | 229.324 | 829.250 | 590.182 | 388.220 | 894.731 | 1398.285 | 391.157 | 1552.882 | -1.057 | 0.110 | 0.114 |
| 165696 - Novosphingobium | 210.782 | 770.711 | 370.016 | 277.105 | 1370.783 | 406.857 | 525.111 | 1396.340 | -1.184 | 0.129 | 0.114 |
| 165695 - Sphingobium | 759.582 | 2048.377 | 1756.712 | 506.663 | 3202.806 | 2728.723 | 818.769 | 2891.203 | -0.927 | 0.133 | 0.114 |
| 29580 - Janthinobacterium | 230.745 | 317.917 | 601.050 | 201.438 | 5777.125 | 635.856 | 271.187 | 736.236 | -2.457 | 0.292 | 0.114 |
| 286 - Pseudomonas | 1579.691 | 7116.301 | 8640.791 | 2966.866 | 113899.851 | 48460.663 | 5460.869 | 8496.268 | -3.118 | 0.174 | 0.200 |
| 469 - Acinetobacter | 2772.345 | 11158.895 | 7052.537 | 9496.731 | 21943.776 | 15249.816 | 3400.060 | 15808.468 | -0.888 | 0.181 | 0.200 |
| 1578 - Lactobacillus | 673.195 | 4523.563 | 821.709 | 1159.547 | 3442.707 | 1517.125 | 16327.927 | 2874.484 | -1.751 | 0.279 | 0.200 |
| 89966 - Hymenobacter | 248.647 | 3774.268 | 603.355 | 408.670 | 5480.882 | 719.606 | 727.344 | 2339.872 | -0.880 | 0.479 | 0.200 |
| 1716 - Corynebacterium | 3982.049 | 12378.212 | 4941.618 | 5210.548 | 19589.483 | 4648.328 | 21728.329 | 9789.607 | -1.072 | 0.154 | 0.343 |
| 1822464 - Paraburkholderia | 177.747 | 640.784 | 1001.859 | 388.902 | 1729.088 | 584.789 | 411.217 | 1560.048 | -0.956 | 0.220 | 0.343 |
| 497 - Psychrobacter | 193.519 | 255.476 | 673.505 | 414.124 | 1373.220 | 291.227 | 289.318 | 1429.082 | -1.138 | 0.221 | 0.343 |
# SP vs HC
list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []
for row in range(len(df_mean_raw_cutoffindex_nonNA)):
condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 4:8].values
reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 8:12].values
res = mannwhitneyu(condition_vals, reference_vals)
list_pvals_mannwhitney.append(res.pvalue)
res = ttest_ind(condition_vals, reference_vals)
list_pvals_ttest.append(res.pvalue)
L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [4,5,6,7,8,9,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney
df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])
df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]
fig, ax = plt.subplots(1, 1, figsize=(9, 3))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/1233428652.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['log2FC'] = L2FC /tmp/ipykernel_2735545/1233428652.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_ttest'] = list_pvals_ttest /tmp/ipykernel_2735545/1233428652.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_MW'] = list_pvals_mannwhitney
| mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | mean POOL9 | mean POOL10 | mean POOL11 | mean POOL12 | log2FC | pval_ttest | pval_MW | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | |||||||||||
| 338 - Xanthomonas | 1603.645 | 1703.963 | 2571.048 | 974.453 | 674.235 | 610.614 | 381.610 | 458.182 | 1.690 | 0.012 | 0.029 |
| 59732 - Chryseobacterium | 2593.373 | 3726.192 | 2432.708 | 3218.189 | 1947.708 | 557.213 | 499.169 | 2170.791 | 1.210 | 0.019 | 0.029 |
| 75 - Caulobacter | 982.627 | 1434.362 | 1276.652 | 4758.112 | 902.512 | 303.119 | 298.865 | 913.577 | 1.805 | 0.146 | 0.029 |
| 9605 - Homo | 1034480.997 | 570833.058 | 1050951.429 | 1048025.882 | 1049918.995 | 1260704.944 | 1109145.249 | 1321520.166 | -0.356 | 0.102 | 0.057 |
| 169133 - Citricoccus | 1091.029 | 1092.269 | 1554.486 | 587.645 | 2314.825 | 351.048 | 270.416 | 1788.143 | 4.077 | 0.279 | 0.057 |
| 572511 - Blautia | 1630.521 | 964184.112 | 20585.045 | 43023.591 | 20057.566 | 158.745 | 289.028 | 617.411 | 5.607 | 0.326 | 0.057 |
| 52972 - Polaromonas | 1054.176 | 627.578 | 1019.014 | 611.836 | 713.985 | 251.541 | 256.143 | 582.779 | 0.876 | 0.066 | 0.114 |
| 517 - Bordetella | 792.250 | 4083.736 | 1213.180 | 4430.690 | 945.261 | 511.253 | 283.145 | 1150.032 | 1.864 | 0.096 | 0.114 |
| 1839 - Nocardioides | 3947.496 | 4966.548 | 4746.983 | 2615.424 | 3319.241 | 1035.856 | 1979.511 | 3760.594 | 0.689 | 0.108 | 0.114 |
| 1485 - Clostridium | 1169.229 | 135619.091 | 6065.036 | 218546.808 | 3276.679 | 609.885 | 620.876 | 1410.372 | 5.932 | 0.144 | 0.114 |
| 816 - Bacteroides | 823.350 | 436335.983 | 5810.066 | 208378.933 | 4382.530 | 221.047 | 379.874 | 2475.217 | 6.448 | 0.170 | 0.114 |
| 909656 - Phocaeicola | 910.719 | 919244.317 | 15336.056 | 179271.849 | 11252.007 | 165.238 | 344.481 | 1222.084 | 6.424 | 0.252 | 0.114 |
| 665874 - Limnohabitans | 747.487 | 1854.432 | 2146.507 | 289.287 | 3800.168 | 1221.228 | 1971.217 | 3390.386 | -1.043 | 0.124 | 0.200 |
| 165695 - Sphingobium | 4139.761 | 3742.727 | 2614.973 | 46364.752 | 3202.806 | 2728.723 | 818.769 | 2891.203 | 2.560 | 0.314 | 0.200 |
| 1678 - Bifidobacterium | 1395.201 | 405656.412 | 12225.569 | 9068.686 | 9712.665 | 333.832 | 189.792 | 1440.626 | 5.197 | 0.336 | 0.200 |
# RR vs SP
list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []
for row in range(len(df_mean_raw_cutoffindex_nonNA)):
condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, :4].values
reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, 4:8].values
res = mannwhitneyu(condition_vals, reference_vals)
list_pvals_mannwhitney.append(res.pvalue)
res = ttest_ind(condition_vals, reference_vals)
list_pvals_ttest.append(res.pvalue)
L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,2,3,4,5,6,7]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney
df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])
df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
/tmp/ipykernel_2735545/248738686.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['log2FC'] = L2FC /tmp/ipykernel_2735545/248738686.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_ttest'] = list_pvals_ttest /tmp/ipykernel_2735545/248738686.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['pval_MW'] = list_pvals_mannwhitney
| mean POOL1 | mean POOL2 | mean POOL3 | mean POOL4 | mean POOL5 | mean POOL6 | mean POOL7 | mean POOL8 | log2FC | pval_ttest | pval_MW | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | |||||||||||
| 9605 - Homo | 1709929.998 | 1933932.908 | 1288302.626 | 1523916.538 | 1034480.997 | 570833.058 | 1050951.429 | 1048025.882 | 0.801 | 0.009 | 0.029 |
| 165696 - Novosphingobium | 210.782 | 770.711 | 370.016 | 277.105 | 1103.073 | 1689.090 | 1261.506 | 2477.333 | -2.004 | 0.010 | 0.029 |
| 1763 - Mycobacterium | 653.232 | 1037.895 | 795.609 | 517.911 | 1171.656 | 1748.223 | 1818.039 | 1193.434 | -0.981 | 0.012 | 0.029 |
| 469 - Acinetobacter | 2772.345 | 11158.895 | 7052.537 | 9496.731 | 13154.365 | 23846.245 | 14577.205 | 80605.920 | -2.117 | 0.166 | 0.029 |
| 29404 - Microlunatus | 241.543 | 678.763 | 574.291 | 535.805 | 1914.559 | 23500.181 | 1537.464 | 1775.031 | -3.823 | 0.266 | 0.029 |
| 165695 - Sphingobium | 759.582 | 2048.377 | 1756.712 | 506.663 | 4139.761 | 3742.727 | 2614.973 | 46364.752 | -3.487 | 0.273 | 0.029 |
| 52972 - Polaromonas | 100.382 | 626.792 | 341.363 | 203.483 | 1054.176 | 627.578 | 1019.014 | 611.836 | -1.381 | 0.022 | 0.057 |
| 33882 - Microbacterium | 659.412 | 2986.042 | 1860.208 | 1189.967 | 3473.980 | 4616.946 | 4375.672 | 2507.824 | -1.161 | 0.025 | 0.057 |
| 41275 - Brevundimonas | 708.218 | 3265.886 | 1339.765 | 2465.827 | 2999.655 | 4858.462 | 4968.558 | 3455.313 | -1.065 | 0.031 | 0.057 |
| 84567 - Pedobacter | 138.674 | 359.418 | 362.606 | 390.606 | 602.592 | 566.640 | 384.005 | 526.159 | -0.733 | 0.034 | 0.057 |
| 407 - Methylobacterium | 1188.605 | 5606.004 | 5145.645 | 2551.464 | 6594.710 | 14681.088 | 7832.803 | 5201.617 | -1.243 | 0.080 | 0.057 |
| 1866885 - Mycolicibacterium | 920.492 | 924.340 | 1120.176 | 379.102 | 1922.648 | 2701.937 | 1145.741 | 1002.424 | -1.018 | 0.089 | 0.057 |
| 75 - Caulobacter | 138.532 | 1170.773 | 482.816 | 353.454 | 982.627 | 1434.362 | 1276.652 | 4758.112 | -1.978 | 0.135 | 0.057 |
| 44249 - Paenibacillus | 218.241 | 818.209 | 1700.724 | 221.548 | 897.416 | 34718.108 | 1746.922 | 3942.162 | -2.427 | 0.221 | 0.057 |
| 125216 - Roseomonas | 174.337 | 458.600 | 707.427 | 485.701 | 5673.565 | 2165.477 | 703.673 | 27898.531 | -4.319 | 0.222 | 0.057 |
# SEX
list_pvals_mannwhitney = []
list_pvals_ttest = []
L2FC = []
for row in range(len(df_mean_raw_cutoffindex_nonNA)):
condition_vals = df_per_raw_cutoffindex_nonNA.iloc[row, [2,3, 6,7, 10,11]].values #FEMALE
reference_vals = df_per_raw_cutoffindex_nonNA.iloc[row, [0,1, 4,5, 8,9]].values #MALE
res = mannwhitneyu(condition_vals, reference_vals)
list_pvals_mannwhitney.append(res.pvalue)
res = ttest_ind(condition_vals, reference_vals)
list_pvals_ttest.append(res.pvalue)
L2FC.append(np.log2(condition_vals.mean() / reference_vals.mean()))
df_pval = df_mean_raw_cutoffindex_nonNA.iloc[:, [0,1,4,5,8,9,2,3,6,7,10,11]]
df_pval['log2FC'] = L2FC
df_pval['pval_ttest'] = list_pvals_ttest
df_pval['pval_MW'] = list_pvals_mannwhitney
df_pval = df_pval.sort_values(by=['pval_MW', 'pval_ttest'])
display(df_pval.iloc[:15])
df_pval_pos = df_pval[(df_pval['pval_MW'] < 0.05)]
try:
fig, ax = plt.subplots(1, 1, figsize=(9, 5))
sns.heatmap(np.log10(df_pval_pos.iloc[:, :-3]), yticklabels=True, annot=True, cmap='Blues')
plt.title('log10 percentage counts')
plt.tight_layout()
except:
print('NO SIGNIFICANT SAMPLES')
/tmp/ipykernel_2735545/3247866785.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_pval['log2FC'] = L2FC
| mean POOL1 | mean POOL2 | mean POOL5 | mean POOL6 | mean POOL9 | mean POOL10 | mean POOL3 | mean POOL4 | mean POOL7 | mean POOL8 | mean POOL11 | mean POOL12 | log2FC | pval_ttest | pval_MW | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| taxon - genus | |||||||||||||||
| 57495 - Dermacoccus | 305.481 | 5917.258 | 13471.751 | 6928.560 | 4459.404 | 2394.527 | 423.370 | 3007.000 | 1343.009 | 1635.176 | 3198.117 | 2476.212 | -1.470 | 0.091 | 0.180 |
| 2034 - Curtobacterium | 287.578 | 1146.786 | 714.049 | 1274.796 | 1256.223 | 301.441 | 400.480 | 268.243 | 562.881 | 382.020 | 288.160 | 751.960 | -0.908 | 0.085 | 0.240 |
| 2053 - Gordonia | 695.644 | 2967.767 | 3917.834 | 4503.012 | 2424.041 | 483.676 | 683.715 | 935.358 | 1951.547 | 1333.037 | 644.310 | 2131.382 | -0.965 | 0.122 | 0.240 |
| 1269 - Micrococcus | 1110.814 | 8954.701 | 9975.120 | 4722.578 | 2855.938 | 10248.032 | 2225.037 | 6058.480 | 2774.230 | 2128.828 | 2784.971 | 5304.719 | -0.832 | 0.145 | 0.240 |
| 149698 - Massilia | 353.789 | 6106.866 | 2303.672 | 1937.320 | 86421.476 | 4726.679 | 1326.179 | 517.058 | 1073.470 | 10077.409 | 707.381 | 1935.530 | -2.703 | 0.329 | 0.240 |
| 42255 - Rubrobacter | 624.033 | 2149.939 | 1150.892 | 3435.797 | 2733.972 | 256.283 | 770.414 | 215.242 | 1529.386 | 963.869 | 559.830 | 1344.093 | -0.943 | 0.162 | 0.310 |
| 28067 - Rubrivivax | 1881.478 | 4131.116 | 1832.583 | 4063.519 | 4614.932 | 1775.669 | 2935.429 | 718.326 | 4460.710 | 1477.345 | 1070.475 | 2922.551 | -0.430 | 0.350 | 0.310 |
| 57493 - Kocuria | 1031.744 | 15190.352 | 54524.200 | 31542.960 | 8896.870 | 2415.975 | 2149.453 | 9120.696 | 5599.527 | 6025.127 | 2836.470 | 7362.656 | -1.779 | 0.146 | 0.394 |
| 316612 - Methylibium | 216.110 | 2643.853 | 1197.633 | 2133.420 | 2423.573 | 375.779 | 928.992 | 327.209 | 2010.186 | 434.434 | 336.476 | 1528.600 | -0.692 | 0.296 | 0.394 |
| 391952 - Aquincola | 951.821 | 4937.236 | 1132.556 | 2768.218 | 2224.077 | 876.308 | 3005.578 | 545.008 | 1318.198 | 520.111 | 832.077 | 2486.363 | -0.566 | 0.387 | 0.394 |
| 338 - Xanthomonas | 374.036 | 2138.231 | 1603.645 | 1703.963 | 674.235 | 610.614 | 512.292 | 339.224 | 2571.048 | 974.453 | 381.610 | 458.182 | -0.440 | 0.513 | 0.394 |
| 2897311 - Fulvia | 303.918 | 989.160 | 550.638 | 830.466 | 1007.416 | 267.445 | 471.783 | 289.887 | 612.793 | 398.651 | 440.920 | 711.754 | -0.433 | 0.279 | 0.485 |
| 590 - Salmonella | 131.286 | 937.761 | 675.219 | 1392.918 | 2223.327 | 690.424 | 378.414 | 331.981 | 797.150 | 13920.034 | 201.365 | 992.396 | -0.580 | 0.357 | 0.485 |
| 165695 - Sphingobium | 759.582 | 2048.377 | 4139.761 | 3742.727 | 3202.806 | 2728.723 | 1756.712 | 506.663 | 2614.973 | 46364.752 | 818.769 | 2891.203 | 1.725 | 0.412 | 0.485 |
| 561 - Escherichia | 234.723 | 2897.806 | 1398.257 | 3607.204 | 3752.919 | 1138.062 | 655.062 | 637.035 | 1080.178 | 5030.515 | 411.988 | 1640.060 | -0.463 | 0.535 | 0.485 |
NO SIGNIFICANT SAMPLES